## Loading required package: digest
## Loading required package: tibble
## Loading required package: ggplot2
## Project name: 01.protein-seq-evo-v1
## Loading project configuration
## Autoloading packages
##  Loading package: dplyr
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##  Loading package: stringr
## Loading required package: stringr
##  Loading package: readr
## Loading required package: readr
##  Loading package: ggplot2
##  Loading package: tidyr
## Loading required package: tidyr
##  Loading package: patchwork
## Loading required package: patchwork
##  Loading package: gridExtra
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
##  Loading package: GGally
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
##  Loading package: readxl
## Loading required package: readxl
##  Loading package: viridis
## Loading required package: viridis
## Loading required package: viridisLite
##  Loading package: cowplot
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
##  Loading package: ggsignif
## Loading required package: ggsignif
##  Loading package: minpack.lm
## Loading required package: minpack.lm
##  Loading package: purrr
## Loading required package: purrr
##  Loading package: scales
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:readr':
## 
##     col_factor
##  Loading package: bigsnpr
## Loading required package: bigsnpr
## Loading required package: bigstatsr
##  Loading package: drc
## Loading required package: drc
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 'drc' has been loaded.
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
## 
## Attaching package: 'drc'
## The following objects are masked from 'package:stats':
## 
##     gaussian, getInitial
##  Loading package: data.table
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## Autoloading helper functions
##  Running helper script: globals.R
##  Running helper script: helpers.R
## Autoloading data
## Munging data
##  Running preprocessing script: 01-util.R
##  Sourcing R script: 01-util.R
##  Running preprocessing script: 02-munge_kras.R
##  Sourcing R script: 02-munge_kras.R
## Rows: 189 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3591 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3572 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (11): column_coverage, popEVE, pop-adjusted_ESM1v, pop-adjusted_EVH_epis...
## lgl  (3): pop-adjusted_EVE, pop-adjusted_Tranception, EVE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): wt_aa, mt_aa, ClinVar_ClinicalSignificance, Starred_Coarse_Grained...
## dbl (11): position, Gold_Stars, NumberSubmitters, frequency_gv2, frequency_g...
## lgl (14): BS1, PM2, PM5, PP5, BP6, b_model, p_model, b_acmg_model, lb_acmg_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2587 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): id, wt_aa, mt_aa, bp_interface, binding_RAF, binding_RAL, binding_...
## dbl (20): Pos_real, abundance_ddg, abundance_ddg_std, pik3cg_ddg, pik3cg_ddg...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3453 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_codon
## dbl (4): Pos_real, mean_kcal/mol, std_kcal/mol, ESM1v
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 03-munge_src.R
## 
##  Sourcing R script: 03-munge_src.R
## 
## Rows: 5112 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): id
## dbl (8): FL_activity_mean_kcal/mol, FL_activity_std_kcal/mol, FL_folding_mea...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10720 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): WT_AA, domains_Pfam, KD_lobe, secondary_structure_uniprot, seconda...
## dbl  (1): position
## lgl (11): block1, block2, block3, block4, block5, R-spine, C-spine, Communit...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4898 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_aa
## dbl (9): FL_kinase_fitness_scaled, FL_kinase_sigma_scaled, FL_abundance_fitn...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 04-munge_psd95.R
## 
##  Sourcing R script: 04-munge_psd95.R
## 
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl  (1): binding_interface_5A
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  Running preprocessing script: 05-munge-grb2.R
## 
##  Sourcing R script: 05-munge-grb2.R
## 
## Rows: 1056 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): id, old_id, id_ref, SS, Pos_class, protein, WT_AA, Mut, wt_aa.x, m...
## dbl (23): Pos_real, Pos_ref, Pos, mut_order, f_dg_pred, f_ddg_pred, f_ddg_pr...
## lgl  (5): f_ddg_pred_conf, b_ddg_pred_conf, allosteric, orthosteric, alloste...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1689 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): id, protein, pca_type, aa_seq, old_id, wt_aa.x, mt_aa, wt_aa.y, at...
## dbl (14): Pos_real, Nham_aa, fitness, sigma, growthrate, growthrate_sigma, c...
## lgl  (1): WT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gck_abundance <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/HXK4_HUMAN_Gersing_2023_abundance.csv", header = TRUE)
gck_activity <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/HXK4_HUMAN_Gersing_2022_activity.csv", header = TRUE)
gck_esm <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_esm1v/P35557.csv")
nrow(gck_abundance) #8396
## [1] 8396
nrow(gck_activity) #8570
## [1] 8570
nrow(gck_esm) #8835
## [1] 8835
gck_df <- merge(gck_abundance, gck_esm, by.x = "mutant", by.y = "variant") 
gck_df <- gck_df %>% dplyr::select (mutant, DMS_score, DMS_score_bin, ESM.1v) %>% 
  dplyr::rename(DMS_score_abundance = DMS_score,
                DMS_score_bin_abundance = DMS_score_bin)
gck_df <- merge(gck_df, gck_activity, by = "mutant") 
gck_df <- gck_df %>% dplyr::rename(DMS_score_activity = DMS_score,
                                   DMS_score_bin_activity = DMS_score_bin) %>% 
  dplyr::select (mutant, DMS_score_abundance, DMS_score_bin_abundance, ESM.1v,
                 DMS_score_activity, DMS_score_bin_activity)

gck_df <- gck_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
gck_clinvar <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/P35557_clinvar_GCK.csv")
nrow(gck_clinvar) #8835
## [1] 8835
gck_df <- merge(gck_df, gck_clinvar, by.x="mutant", by.y="variant")


gck_clinvar_web <- read.delim("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/GCK_clinvar_web_042925.txt")
nrow(gck_clinvar_web) #319
## [1] 319
table(gck_clinvar_web$Variant.type)
## 
## single nucleotide variant 
##                       319
# single nucleotide variant 
#                       319

# Map from 3-letter to 1-letter amino acid codes
aa_map <- c(
  Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
  Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
  Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
  Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
  Ter="*"
)

# Extract long-form protein change (e.g., Ala456Val)
gck_clinvar_web$canonical_long <- sub(".*\\(p\\.([A-Za-z]+\\d+[A-Za-z]+)\\).*", "\\1", gck_clinvar_web$Name)

# Convert long form to short form using aa_map
gck_clinvar_web$canonical_short <- sapply(gck_clinvar_web$canonical_long, function(change) {
  if (grepl("^([A-Za-z]{3})(\\d+)([A-Za-z]{3})$", change)) {
    parts <- regmatches(change, regexec("^([A-Za-z]{3})(\\d+)([A-Za-z]{3})$", change))[[1]]
    from <- aa_map[[parts[2]]]
    to <- aa_map[[parts[4]]]
    pos <- parts[3]
    if (!is.null(from) && !is.null(to)) {
      return(paste0(from, pos, to))
    }
  }
  return(NA)
})

gck_clinvar_web <- gck_clinvar_web %>% dplyr::select( canonical_short, Condition.s., Germline.classification,Protein.change)
nrow(gck_clinvar_web) #319
## [1] 319
# Classify based on known pathogenic GCK disease mechanisms
classify_gck_conditions <- function(cond_str) {
  cond_lower <- tolower(cond_str)
  
  # Individual flags
  is_mody <- grepl("maturity[- ]onset diabetes.*young", cond_lower)
  is_pndm <- grepl("neonatal diabetes", cond_lower)
  is_hh   <- grepl("hyperinsulinism", cond_lower)
  is_mono <- grepl("monogenic diabetes", cond_lower)
  is_np   <- grepl("not provided", cond_lower)
  
  # Label assignment based on specific rules
  labels <- c()
  if (is_mody)  labels <- c(labels, "MODY")
  if (is_pndm)  labels <- c(labels, "PNDM")
  if (is_hh)    labels <- c(labels, "HH")
  if (is_mono)  labels <- c(labels, "Monogenic diabetes")
  if (is_np && length(labels) == 0) labels <- c("Not provided")  # only assign if nothing else detected
  
  if (length(labels) == 0) {
    return("Other")
  } else if (length(labels) == 1) {
    return(labels)
  } else {
    return("Mixed")
  }
}

# Apply the classification
gck_clinvar_web$clean_condition <- sapply(gck_clinvar_web$Condition.s., classify_gck_conditions)

# View cleaned summary
#table(gck_clinvar_web$clean_condition)
#HH              Mixed               MODY Monogenic diabetes       Not provided              Other 
#3                 14                 69                189                 41                  3 

gck_clinvar_web <- gck_clinvar_web %>%
  mutate(canonical_short = if_else(
    is.na(canonical_short),
    Protein.change,
    canonical_short
  ))

nrow(gck_clinvar_web) #319 3+14+69+189 = 275
## [1] 319
length(unique(gck_clinvar_web$canonical_short)) #313
## [1] 313
# Find duplicated canonical_short values
dup_idx <- duplicated(gck_clinvar_web$canonical_short) | duplicated(gck_clinvar_web$canonical_short, fromLast = TRUE)

# Extract rows with duplicated canonical_short values
gck_clinvar_dups <- gck_clinvar_web[dup_idx, ]

gck_clinvar_web <- gck_clinvar_web %>% distinct(canonical_short, .keep_all = TRUE)
nrow(gck_clinvar_web) #313
## [1] 313
gck_df_merged <- merge(gck_df, gck_clinvar_web, by.x="mutant", by.y="canonical_short", all.x = TRUE)
nrow(gck_df_merged) #8255
## [1] 8255
range(gck_df_merged$DMS_score_abundance) #-0.9834964  1.6096087
## [1] -0.9834964  1.6096087
range(gck_df_merged$DMS_score_activity) #-1.085214  6.670528
## [1] -1.085214  6.670528
gck_gnomad <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_gnomad/GCK_gnomAD_v4.1.0_ENSG00000106633_2025_04_09_14_05_39.csv")
gck_gnomad <- gck_gnomad %>% filter(VEP.Annotation == "missense_variant")
summary(gck_gnomad$Allele.Frequency)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 6.195e-07 6.197e-07 6.230e-07 1.099e-05 1.859e-06 2.580e-03
nrow(gck_gnomad) #528
## [1] 528
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
# 6.195e-07 6.197e-07 6.230e-07 1.076e-05 1.859e-06 2.580e-03

gck_gnomad <-  gck_gnomad %>% dplyr::select(HGVS.Consequence, ClinVar.Germline.Classification,
                                            rsIDs, ClinVar.Variation.ID)
nrow(gck_gnomad) #528
## [1] 528
# Function to convert e.g. "p.Glu2Gln" → "E2Q"
convert_to_short <- function(consequence) {
  consequence <- gsub("^p\\.", "", consequence)  # Remove 'p.'
  matches <- regmatches(consequence, regexec("([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", consequence))[[1]]
  if (length(matches) == 4) {
    from <- aa_map[[matches[2]]]
    pos <- matches[3]
    to <- aa_map[[matches[4]]]
    return(paste0(from, pos, to))
  } else {
    return(NA)
  }
}

# Apply to the column
gck_gnomad$ID <- sapply(gck_gnomad$HGVS.Consequence, convert_to_short)

nrow(gck_gnomad) #528
## [1] 528
length(unique(gck_gnomad$ID)) #519
## [1] 519
gck_gnomad <- gck_gnomad %>% distinct(ID, .keep_all = TRUE)
nrow(gck_gnomad) #519
## [1] 519
nrow(gck_df_merged) #8255
## [1] 8255
gck_df <- merge(gck_df_merged, gck_gnomad, by.x="mutant", by.y = "ID", all.x = TRUE)
nrow(gck_df) #8255
## [1] 8255
length(unique(gck_df$mutant)) #8255
## [1] 8255
#https://cspec.genome.network/cspec/ui/svi/doc/GN086
active_positions <- c(151:179, # disordered loop
                      151-153, 168-169, 204-206, 225-231, 254-258, 287, 290, # glucose-binding
                      78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416 # ATP-binding
)

fil_gck_df <- gck_df %>%
  filter(!mutation_position %in% active_positions)

nrow(fil_gck_df) #7224
## [1] 7224
# Fit a loess model using the filtered data
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_gck_df, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model trained on fil_gck_df
gck_df$fitted <- predict(loess_fit, newdata = gck_df)

# Calculate residuals for ALL points
gck_df$residuals <- gck_df$DMS_score_activity - gck_df$fitted
range(gck_df$residuals) #-1.891487  6.102024
## [1] -1.891487  6.102024
# Generate LOESS fit line from fil_gck_df
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_gck_df, span = 0.7, family = "symmetric")

fit_line_df <- data.frame(
  DMS_score_abundance = seq(-0.6,
                            max(fil_gck_df$DMS_score_abundance, na.rm = TRUE),
                            length.out = 200)
)

fit_line_df$DMS_score_activity <- predict(loess_fit, newdata = fit_line_df)

#length(unique(gck_df$mutant)) #8255

range(gck_df$residuals) #-1.891487  6.102024
## [1] -1.891487  6.102024
range(gck_df$DMS_score_abundance) #-0.9834964  1.6096087
## [1] -0.9834964  1.6096087
range(gck_df$DMS_score_activity) #-1.085214  6.670528
## [1] -1.085214  6.670528
median(gck_df$DMS_score_activity) #0.56
## [1] 0.5603297
gck_df <- gck_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

nrow(gck_df)
## [1] 8255
median_residuals <- gck_df %>%
  dplyr::group_by(mutation_position) %>%
  summarise(median_residuals = median(residuals, na.rm = TRUE))

min(median_residuals$median_residuals) #-1.136173
## [1] -1.136173
max(median_residuals$median_residuals) #2.551111
## [1] 2.551111
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s.pdb")
data <- median_residuals

# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b

# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
  position <- data$mutation_position[i]
  correlation_value <- data$median_residuals[i]
  
  # Find indices in the PDB that match the current position
  indices <- which(pdb$atom$resno == position)
  
  # Print the indices and current B-factors before updating
  #cat("Updating residue number:", position, "\n")
  #cat("Indices in PDB:", indices, "\n")
  #cat("Current B-factors:", new_b_factors[indices], "\n")
  
  # Update B-factors for all atoms in the current residue
  new_b_factors[indices] <- correlation_value
  
  # Print the new B-factors after updating
  #cat("Updated B-factors:", new_b_factors[indices], "\n")
  #cat("\n")  # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999

# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s_median_residual_loess.pdb")
# Plot
nrow(gck_df)
## [1] 8255
p1 <- ggplot(gck_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  geom_text_repel(data = subset(gck_df, mutant %in% c("Y214E")),
                  aes(label = mutant),
                  size = 4,color = "gold2",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_text_repel(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             shape = 17, color = "black", size = 3) +  # Triangle shape
  scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) + 
  labs(
    title = "All 8255 Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "LOESS residuals"
  ) +
  scale_y_continuous(breaks = seq(-1, 7, by = 2))  + xlim(-1,1.7)  +
  theme_classic() + theme(legend.position = "none") +theme(
  panel.background = element_rect(fill = "white", color = NA),
  plot.background = element_rect(fill = "white", color = NA),
  legend.background = element_rect(fill = "white", color = NA)
)

p1 <- ggMarginal(
  p1,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

patho_df <- gck_df %>% filter(clean_condition %in% c("HH", "MODY", "Monogenic diabetes"))
nrow(patho_df) #224
## [1] 224
p2 <- ggplot(patho_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2, shape = 17) +
  geom_text_repel(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             shape = 17, color = "black", size = 3) +  # Triangle shape
  geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) + 
  labs(
    title = "ClinVar Pathogenic Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Loess residuals"
  ) +scale_y_continuous(
    breaks = seq(-1, 7, by = 2),
    limits = c(-1.1, 7)
  ) +
  theme_classic()  + xlim(-1,1.7) +
  theme(legend.position = "none") 

p2 <- ggMarginal(
  p2,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

gnomad_df <- gck_df %>% filter(!is.na(HGVS.Consequence)) %>% 
  filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele", "VUS")) %>%
  filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
                                                 "Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
                                                 "Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
                                                 "Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
  filter(is.na(Germline.classification))
  #filter(is.na(Phenotype_Class))
  #filter(is.na(somatic))

#nrow(gnomad_df) #268


p3 <- ggplot(gnomad_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2) +
  geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  geom_text_repel(data = subset(gck_df, mutant %in% c("A11T", "E279Q", "R46K")),
                  aes(label = mutant),
                  size = 4,color = "black",
                  max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  geom_point(data = subset(gck_df, mutant %in% c("A11T", "E279Q", "R46K")),
             aes(x = DMS_score_abundance, y = DMS_score_activity),
             color = "black", size = 2) +  # Triangle shape
  scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) + 
  labs(
    title = "gnomAD Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Loess residuals"
  ) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
    limits = c(-1, 7)) + xlim(-1,1.7) +theme(legend.position = "none") 


p3 <- ggMarginal(
  p3,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

benign_df <- gck_df %>% 
  filter(clinvar_clinical_significance %in% c("likely_benign", "benign")) %>%
  filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
                                                 "Conflicting classifications of pathogenicity")) %>%
  filter(is.na(Germline.classification))

#nrow(benign_df) #3


p4 <- ggplot(benign_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2) +
  geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) + 
  labs(
    title = "ClinVar Benign Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Loess residuals"
  ) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
                                           limits = c(-1, 7)) + xlim(-1,1.7) +theme(legend.position = "none") 

p4 <- ggMarginal(
  p4,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

p5 <- plot_grid(p1,p3,p2, nrow=1, ncol=3)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_scatter.pdf", 
       plot = p5, width = 9, height = 3, dpi = 300)
p5

p1 <- ggplot(benign_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
  geom_point(size = 2) +
  geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
            inherit.aes = FALSE, color = "black", linewidth = 1) +
  scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) + 
  labs(
    title = "ClinVar Benign Variants",
    x = "Measured abundance",
    y = "Measured activity",
    color = "Loess residuals"
  ) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
                                           limits = c(-1, 7)) + xlim(-1,1.7) +
  theme(legend.position = "bottom")

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_scatter_legend.pdf", 
       plot = p1, width = 3, height = 3, dpi = 300)
p1

# 1. Set up
set.seed(11)
n_boot <- 1000
match_window <- 0.5

n_patho <- nrow(patho_df)

# 2. Get abundance/residuals from gck_patho
patho_df <- patho_df %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- gck_df %>%
  filter(!(mutant %in% patho_df$mutant)) 

bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)
)

# 7. Combine with pathogenic residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)

# 8. Labels for plot
label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  ) %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))


# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 3.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residuals",
    title = "All Variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01) 


p_fast

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin1.pdf", 
       plot = p_fast, width = 3, height = 4, dpi = 300)
p_fast

hist(boot_df$residuals, breaks = 30, main = "Bootstrap medians", xlab = "Residuals")

nrow(gnomad_df) #268
## [1] 268
patho_df <- gck_df %>% filter(!is.na(Germline.classification))
#nrow(patho_df) #277
gnomad_df <- gnomad_df %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df1 <- patho_df %>% filter(clean_condition == "HH") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df2 <- patho_df %>% filter(clean_condition == "MODY") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df3 <- patho_df %>% filter(clean_condition == "Monogenic diabetes") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)

nrow(patho_df1)
## [1] 3
nrow(patho_df2)
## [1] 57
nrow(patho_df3)
## [1] 164
patho_df1$var_class2 <- "HH"
patho_df2$var_class2 <- "MODY"
patho_df3$var_class2 <- "Monogenic diabetes"
gnomad_df$var_class2 <- "gnomAD"


wilcox.test(patho_df1$residuals, gnomad_df$residuals) #0.004004
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df1$residuals and gnomad_df$residuals
## W = 791, p-value = 0.004004
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, gnomad_df$residuals) #3.87e-07
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df2$residuals and gnomad_df$residuals
## W = 4368, p-value = 3.87e-07
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df3$residuals, gnomad_df$residuals) #< 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df3$residuals and gnomad_df$residuals
## W = 10458, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df2$residuals) #0.003939
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df1$residuals and patho_df2$residuals
## W = 171, p-value = 0.003939
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df3$residuals) #0.003479
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df1$residuals and patho_df3$residuals
## W = 489, p-value = 0.003479
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, patho_df3$residuals) #0.3465
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  patho_df2$residuals and patho_df3$residuals
## W = 5066, p-value = 0.3465
## alternative hypothesis: true location shift is not equal to 0
combined_df <- rbind(patho_df1,patho_df2,patho_df3,gnomad_df)
median_df <- combined_df %>%
  group_by(var_class2) %>%
  summarise(
    median_residual = median(residuals),
    n = n()
  )

combined_df$var_class2 <- factor(
  combined_df$var_class2,
  levels = c("gnomAD", "HH", "MODY", "Monogenic diabetes")
)

custom_colors <- c(
  "gnomAD" = "#1b9e77",   # Teal green
  "HH"   = "#f4a6b3",   # Warm orange
  "MODY"   = "#e7298a",   # Muted purple
  "Monogenic diabetes"   = "#7570b3"    # Hot pink / magenta
)

p10 <- ggplot(combined_df, aes(x = var_class2, y = residuals, fill = var_class2)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = median_residual + 0.5, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = 3, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "Across Groups",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 20, hjust = 1)
  ) +
    geom_signif(
    comparisons = list(c("gnomAD", "HH"), 
                       c("gnomAD", "MODY"), 
                       c("gnomAD", "Monogenic diabetes")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin2.pdf", 
       plot = p10, width = 3, height = 4, dpi = 300)
p10

#nrow(gnomad_df) #268
#nrow(patho_df) #277
#length(unique(patho_df$mutant)) #277
#table(patho_df$clean_condition)
#HH              Mixed               MODY Monogenic diabetes       Not provided              Other 
#3                 14                 57                164                 36                  3

patho_df <- patho_df %>% filter(clean_condition %in% c("HH","MODY","Monogenic diabetes") )
gnomad_df$var_source <- "gnomAD"
patho_df$var_source <- "Pathogenic"

gnomad_df <- gnomad_df %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
patho_df <- patho_df %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
combined_df <- rbind(gnomad_df, patho_df)

combined_df <- combined_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

combined_df_int <- combined_df %>% filter(mutation_position %in% active_positions)
nrow(combined_df_int) #71
## [1] 71
#length(unique(combined_df_int$mutant)) #71

median_df <- combined_df_int %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
#median_df

#table(combined_df_int$var_source)
custom_colors <- c(
  "gnomAD"     = "#1b9e77",  # Teal green (unchanged)
  "Pathogenic" = "#de425b"   # Deep ocean blue
)
p13 <- ggplot(combined_df_int, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 3, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "71 Orthosteric Variants",
    x = "",
    y = "Activity_abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none"
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1, y_position = 3.5, 
    textsize = 3
  )  + ylim(-5, 5)
p13

combined_df_out <- combined_df %>% filter(!mutation_position %in% active_positions)
nrow(combined_df_out) #421
## [1] 421
median_df <- combined_df_out %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
#median_df

p14 <- ggplot(combined_df_out, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 3, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "421 Non-orthosteric Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none"
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1, y_position = 3.5, 
    textsize = 3
  )  + ylim(-5, 5)
p14

combined_df_stable <- combined_df %>% filter(DMS_score_abundance > 0.58)
nrow(combined_df_stable) #350
## [1] 350
#table(combined_df_stable$var_source)

combined_df_stable_int <- combined_df_stable %>% filter(mutation_position %in% active_positions)
nrow(combined_df_stable_int) #59
## [1] 59
median_df <- combined_df_stable_int %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
#median_df

p15 <- ggplot(combined_df_stable_int, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 3, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "59 WT-abundance Orthosteric Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none"
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1, y_position = 3.5, 
    textsize = 3
  ) + ylim(-5, 5)


combined_df_stable_out <- combined_df_stable %>% filter(!mutation_position %in% active_positions)
nrow(combined_df_stable_out) #291
## [1] 291
median_df <- combined_df_stable_out %>%
  group_by(var_source) %>%
  summarise(
    median_residual = median(residuals, na.rm = TRUE),
    n = n()
  )
#median_df

p16 <- ggplot(combined_df_stable_out, aes(x = var_source, y = residuals, fill = var_source)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors) +
  geom_text(
    data = median_df,
    aes(x = var_source, y = 3, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4.5
  ) +
  labs(
    title = "291 WT-abundance Non-orthosteric Variants",
    x = "",
    y = "Activity-abundance residuals",
    fill = ""
  ) +
  theme_classic() +
  theme(
    legend.position = "none"
  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "Pathogenic")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1, y_position = 3.5, 
    textsize = 3
  )  + ylim(-5, 5)


p17 <- plot_grid(p13,p14,p15,p16,ncol=4,nrow=1)
p17

p18 <- plot_grid(p13,p14,ncol=2,nrow=1)

p19 <- plot_grid(p15,p16,ncol=2,nrow=1)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin3.pdf", 
       plot = p18, width = 5, height = 3, dpi = 300)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin4.pdf", 
       plot = p19, width = 5, height = 3, dpi = 300)

p18

p19

pdb <- read.pdb("~/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s.pdb")
#unique(pdb$atom$resid)
# Extract C-alpha atoms from protein (exclude ligand)
protein_ca <- pdb$atom[pdb$atom$elety == "CA" & pdb$atom$resid != "GLC" & pdb$atom$resid != "MRK" & pdb$atom$resid != "HOH", ]

# Extract ligand atoms (TLA residue)
ligand_atoms <- pdb$atom[pdb$atom$resid == "GLC" & pdb$atom$type == "HETATM", ]

# Calculate min distance from each C-alpha to the ligand
protein_ca$min_dist_to_ligand <- apply(protein_ca, 1, function(atom) {
  dists <- sqrt((as.numeric(atom[["x"]]) - as.numeric(ligand_atoms$x))^2 +
                  (as.numeric(atom[["y"]]) - as.numeric(ligand_atoms$y))^2 +
                  (as.numeric(atom[["z"]]) - as.numeric(ligand_atoms$z))^2)
  return(min(dists))
})

# View summary
#nrow(protein_ca) #448
#summary(protein_ca$min_dist_to_ligand)
#head(protein_ca$min_dist_to_ligand)
#length(protein_ca$min_dist_to_ligand) #448

#nrow(gck_df_merged)
fil_protein_ca <- protein_ca %>% dplyr::select(resid, resno, min_dist_to_ligand)
active_positions <- c(151:179, # disordered loop
                      151-153, 168-169, 204-206, 225-231, 254-258, 287, 290, # glucose-binding
                      78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416 # ATP-binding
)

active_sites <- c(151:179, # disordered loop
                  78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416) # ATP-binding

binding_sites <- c(151:153, 168:169, 204:206, 225:231, 254:258, 287, 290) # glucose-binding)

merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255

merged_df <- merged_df %>% dplyr::select(-sequence)
nrow(merged_df)
## [1] 8255
merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]
nrow(merged_df) #7969
## [1] 7969
gnomad_df <- merged_df %>% filter(mutant %in% gnomad_df$mutant)
nrow(gnomad_df)
## [1] 251
patho_df1 <- merged_df %>% filter(mutant %in% patho_df1$mutant)
nrow(patho_df1)
## [1] 3
patho_df2 <- merged_df %>% filter(mutant %in% patho_df2$mutant)
nrow(patho_df2)
## [1] 57
patho_df3 <- merged_df %>% filter(mutant %in% patho_df3$mutant)
nrow(patho_df3)
## [1] 164
gnomad_df <- gnomad_df %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df3 <-patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)

patho_df1$var_class2 <- "HH"
patho_df2$var_class2 <- "MODY"
patho_df3$var_class2 <- "Monogenic diabetes"
gnomad_df$var_class2 <- "gnomAD"

combined_df <- rbind(patho_df1,patho_df2,patho_df3,gnomad_df)
median_df <- combined_df %>%
  group_by(var_class2) %>%
  summarise(
    median_dist = median(min_dist_to_ligand),
    n = n()
  )
median_df
## # A tibble: 4 × 3
##   var_class2         median_dist     n
##   <chr>                    <dbl> <int>
## 1 HH                        16.6     3
## 2 MODY                      16.6    57
## 3 Monogenic diabetes        15.6   164
## 4 gnomAD                    21.4   251
combined_df$var_class2 <- factor(
  combined_df$var_class2,
  levels = c("gnomAD", "HH", "MODY", "Monogenic diabetes")
)

custom_colors <- c(
  "gnomAD" = "#1b9e77",   # Teal green
  "HH"   = "#f4a6b3",   # Warm orange
  "MODY"   = "#e7298a",   # Muted purple
  "Monogenic diabetes"   = "#7570b3"    # Hot pink / magenta
)

combined_df <- combined_df %>%
  mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))

combined_df$site_type <- "Non-orthosteric site"
combined_df$site_type[combined_df$mutation_position %in% active_sites] <- "Active site"
combined_df$site_type[combined_df$mutation_position %in% binding_sites] <- "Binding site"
table(combined_df$site_type)
## 
##          Active site         Binding site Non-orthosteric site 
##                   48                   26                  401
nrow(combined_df)
## [1] 475
p1 <- ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
 geom_jitter(aes(shape = site_type), width = 0.15, size = 2, alpha = 0.7,
            fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y =  median_dist + 1, label = sprintf("%.2f",  median_dist)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors, guide = "none") +
  scale_shape_manual(values = c(
  "Non-orthosteric site" = 21,
  "Active site" = 22,
  "Binding site" = 23
)) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = 40, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  labs(
    title = "475 Variants",
    x = "",
    y = "Distance to glucose",
    fill = "",
    shape = "Site Type"
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)

  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "HH"), 
                       c("gnomAD", "MODY"), 
                       c("gnomAD", "Monogenic diabetes")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 
p1

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin5.pdf", 
       plot = p1, width = 3, height = 4, dpi = 300)
p1

ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
 geom_jitter(aes(shape = site_type), width = 0.15, size = 2, alpha = 0.7,
            fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y =  median_dist + 1, label = sprintf("%.2f",  median_dist)),
    inherit.aes = FALSE,
    size = 5
  ) +
  scale_fill_manual(values = custom_colors, guide = "none") +
  scale_shape_manual(values = c(
  "Non-orthosteric site" = 21,
  "Active site" = 22,
  "Binding site" = 23
)) +
  geom_text(
    data = median_df,
    aes(x = var_class2, y = 40, label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  labs(
    title = "475 Variants",
    x = "",
    y = "Distance to glucose",
    fill = "",
    shape = "Site Type"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)

  ) +
  geom_signif(
    comparisons = list(c("gnomAD", "HH"), 
                       c("gnomAD", "MODY"), 
                       c("gnomAD", "Monogenic diabetes")),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    textsize = 3
  ) 

nrow(merged_df) #7969
## [1] 7969
merged_df_neg <- merged_df %>% filter(residuals <=0)
nrow(merged_df_neg) #4417
## [1] 4417
merged_df_residue <- merged_df_neg %>%
  group_by(mutation_position) %>%
  summarise(
    loess_residual_avg = median(residuals, na.rm = TRUE))
    
merged_df_residue <- merge(merged_df_residue, protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
    
merged_df_residue <- merged_df_residue[!is.na(merged_df_residue$min_dist_to_ligand), ]
nrow(merged_df_residue) #445
## [1] 445
# Fit exponential decay: y = a * exp(-b * x) 
exp_model <- nls(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
##   model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
##    data: merged_df_residue
##      a      b 
## 0.9493 0.0438 
##  residual sum-of-squares: 19.54
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 8.104e-07
# Nonlinear regression model
#   model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
#    data: merged_df_residue
#      a      b 
# 0.9493 0.0438 
#  residual sum-of-squares: 19.54
# 
# Number of iterations to convergence: 4 
# Achieved convergence tolerance: 8.104e-07
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]

merged_df_neg <- merged_df_neg %>%
  mutate(expected_res = a * exp(-b * min_dist_to_ligand),
         residual = abs(residuals) - expected_res)

# One-sided Wilcoxon signed-rank test for each residue
wilcox_test_results <- merged_df_neg %>%
  group_by(mutation_position) %>%
  summarise(
    p_wilcox = tryCatch(wilcox.test(residual, mu = 0, alternative = "greater")$p.value, error = function(e) NA_real_),
    .groups = "drop"
  ) %>%
  mutate(p_adj = p.adjust(p_wilcox, method = "fdr")) 

merged_df_residue <- left_join(merged_df_residue, wilcox_test_results, by = "mutation_position") %>%
  mutate(
    hotspot = !is.na(p_adj) & p_adj < 0.05,
    hotspot_label = ifelse(hotspot, as.character(mutation_position), NA)
  )

merged_df_residue %>% filter(hotspot == TRUE) %>% pull(mutation_position)
##  [1]  76  82  85  86 118 119 122 123 125 131 134 140 141 142 143 146 148 152 156
## [20] 158 159 160 162 163 164 168 170 171 173 175 178 180 181 183 184 185 186 188
## [39] 189 191 192 195 196 197 199 202 203 228 246 295 445 448 449 450 451 453
half_d <- log(2) / b  
half_d #15.82528
## [1] 15.82366
x_vals <- seq(min(merged_df_residue$min_dist_to_ligand),
              max(merged_df_residue$min_dist_to_ligand), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
  fit <- try(nlsLM(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  min_dist_to_ligand = x_vals,
  loess_residual_avg = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
merged_df_residue$site_type <- "Non-orthosteric site"
#merged_df_residue$site_type[abs(merged_df_residue$loess_residual_avg) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% binding_sites] <- "Glucose-binding site"

max(merged_df_residue %>% filter (site_type == "Glucose-binding site") %>% pull(min_dist_to_ligand)) #8.666125
## [1] 8.666125
table(merged_df_residue$site_type)
## 
##     ATP-binding site Glucose-binding site Non-orthosteric site 
##                   45                   22                  378
nrow(merged_df_residue) #445
## [1] 445
# Plot with fitted curve
p23 <- ggplot(merged_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg))) +
  # First, plot NULL points separately with alpha = 0.2
  geom_point(aes(color = site_type), size = 1) +
  geom_text_repel(data = merged_df_residue %>% filter(hotspot) %>% filter (site_type != "ATP-binding site"), aes(label = hotspot_label, color = site_type), size = 3.5, max.overlaps = 50) +
  geom_ribbon(data = fit_df_residue,
            aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
            fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg)),
          inherit.aes = FALSE, color = "black", size = 1) +
  scale_color_manual(values = c(
    "Null" = "grey",
    "Non-orthosteric site" = "darkgreen",
    "ATP-binding site" = "cyan",
    "Glucose-binding site" = "orange"
  )) +
  theme_classic() +
  geom_vline(xintercept = 8.666125, linetype = "dashed", color = "slategrey") +
  labs(
    title = "GCK: per-residue allosteric decay",
    subtitle = "445 residues",
    x = "Minimal distance to glucose",
    y = "|Median activity-abundance residual|",
    color = ""
  )  + theme(legend.position = "none") 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p23 <- ggMarginal(
  p23,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)
p23

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_decay1.pdf", 
       plot = p23, width = 5, height = 5, dpi = 300)
p23

lm_model <- lm(log(abs(loess_residual_avg)) ~ min_dist_to_ligand, data = merged_df_residue)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand, 
##     data = merged_df_residue)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4024 -0.3130  0.1089  0.4467  1.5179 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.173587   0.085822  -2.023   0.0437 *  
## min_dist_to_ligand -0.045660   0.004112 -11.104   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6397 on 443 degrees of freedom
## Multiple R-squared:  0.2177, Adjusted R-squared:  0.2159 
## F-statistic: 123.3 on 1 and 443 DF,  p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand, 
#     data = merged_df_residue)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -3.4024 -0.3130  0.1089  0.4467  1.5179 
# 
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)        -0.173587   0.085822  -2.023   0.0437 *  
# min_dist_to_ligand -0.045660   0.004112 -11.104   <2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.6397 on 443 degrees of freedom
# Multiple R-squared:  0.2177,  Adjusted R-squared:  0.2159 
# F-statistic: 123.3 on 1 and 443 DF,  p-value: < 2.2e-16
merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255

merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]

merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df_neg) #2003
## [1] 4417
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
                 data = merged_df_neg,
                 start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
##   model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
##    data: merged_df_neg
##       a       b 
## 0.90290 0.03361 
##  residual sum-of-squares: 431
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 2.906e-07
# Nonlinear regression model
#   model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
#    data: merged_df
#       a       b 
# 0.90290 0.03361 
#  residual sum-of-squares: 431
# 
# Number of iterations to convergence: 4 
# Achieved convergence tolerance: 2.906e-07
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]

merged_df_neg <- merged_df_neg %>%
  mutate(
    expected_res = a * exp(-b * min_dist_to_ligand),
    observed_abs_res = abs(residuals),
    interpolated_upper = approx(x = fit_df_residue$min_dist_to_ligand,
                                y = fit_df_residue$upper,
                                xout = min_dist_to_ligand,
                                rule = 2)$y,
    sig_above_curve = observed_abs_res > interpolated_upper
  )

# 4. Get mutations significantly above the curve
significant_mutations <- merged_df_neg %>%
  filter(sig_above_curve) 
nrow(merged_df_neg)
## [1] 4417
merged_df_neg$site_type <- "Non-orthosteric site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% binding_sites] <- "Glucose-binding site"

merged_df_neg$pathogenic_status <- ifelse(
  merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
  "Pathogenic", "Other"
)

table(merged_df_neg$site_type)
## 
##     ATP-binding site Glucose-binding site Non-orthosteric site 
##                  683                  346                 3388
half_d <- log(2) / b  #20.62324 
half_d
## [1] 20.62314
merged_df_neg$site_type <- "Non-orthosteric site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% binding_sites] <- "Glucose-binding site"

x_vals <- seq(min(merged_df_neg$min_dist_to_ligand),
              max(merged_df_neg$min_dist_to_ligand), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_neg[sample(nrow(merged_df_neg), replace = TRUE), ]
  fit <- try(nlsLM(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  min_dist_to_ligand = x_vals,
  residuals = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)

merged_df_neg$pathogenic_status <- ifelse(
  merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
  "Pathogenic", "Other"
)


# Plot
p24 <- ggplot(merged_df_neg, aes(x = min_dist_to_ligand, y = abs(residuals))) +
  # Base layer: all points
  geom_point(aes(color = site_type, alpha = pathogenic_status, shape = pathogenic_status), size = 2) +
  geom_ribbon(data = fit_df_residue,
            aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
            fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(residuals)),
          inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(
    data = merged_df_neg %>% filter(sig_above_curve) %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "ATP-binding site"),
    aes(label = mutant, color = site_type),
    size = 3.5, max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
  ) +
  # Manual color palette for site type
  scale_color_manual(values = c(
    #"Null" = "grey",
    "Non-orthosteric site" = "darkgreen",
    "ATP-binding site" = "cyan",
    "Glucose-binding site" = "orange"
  )) +
  
  # Transparency scale
  scale_alpha_manual(values = c("Other" = 0.1, "Pathogenic" = 1)) +
  # Reference lines
  geom_vline(xintercept = 8.666125, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "GCK: per-mutation allosteric decay",
    x = "Minimal distance to glucose",
    y = "|Activity-abundance residual|",
    color = ""
  ) + theme(legend.position = "none")

p24 <- ggMarginal(
  p24,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_decay2.pdf", 
       plot = p24, width = 5, height = 5, dpi = 300)
p24

# merged_df_neg$pathogenic_status <- ifelse(
#   merged_df_neg$clinvar_clinical_significance %in% c("pathogenic","likely_pathogenic"),
#   "Pathogenic", "Other"
# )
# 
# merged_df_neg$curve_residual <- merged_df_neg$observed_abs_res - merged_df_neg$expected_res
# 
# merged_df_neg_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
#   filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_patho_nonortho) #72
# 
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
#   filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_nonortho) #3316
# 
# wilcox.test(merged_df_neg_patho_nonortho$curve_residual, merged_df_neg_non_patho_nonortho$curve_residual)
# # 1. Combine data
# plot_df <- bind_rows(
#   merged_df_neg_patho_nonortho %>%
#     mutate(group = "Pathogenic"),
#   merged_df_neg_non_patho_nonortho %>%
#     mutate(group = "Non-pathogenic")
# )
# 
# # 2. Summary stats for labeling
# label_df <- plot_df %>%
#   group_by(group) %>%
#   summarise(
#     n_label = paste0("n = ", n()),
#     median_val = median(curve_residual, na.rm = TRUE)
#   )
# plot_df$group <- factor(plot_df$group, levels = c("Pathogenic", "Non-pathogenic"))
# 
# # 3. Plot
# p_violin <- ggplot(plot_df, aes(x = group, y = curve_residual, fill = group)) +
#   geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
#   geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey") +
#   stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
#   stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
#                fill = "black", color = "black", stroke = 0.7) +
# 
#   # Add n labels
#   geom_text(
#     data = label_df,
#     aes(x = group, y = max(plot_df$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
#     inherit.aes = FALSE,
#     size = 4
#   ) +
# 
#   # Add median labels
#   geom_text(
#     data = label_df,
#     aes(x = group, y = median_val + 0.15, label = sprintf("%.2f", median_val)),
#     inherit.aes = FALSE,
#     size = 6
#   ) +
# 
#   labs(
#     x = "",
#     y = "Residual to exponential curve",
#     title = "Non-orthosteric Variants"
#   ) +
#   theme_classic(base_size = 14) +
#   scale_fill_manual(values = c(
#     "Pathogenic" = "indianred3",
#     "Non-pathogenic" = "tan3"
#   )) +
#   theme(legend.position = "none") +
# 
#   # Wilcoxon significance
#   geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
#               map_signif_level = FALSE,
#               test = "wilcox.test",
#               tip_length = 0.01)
# 
# # Show plot
# p_violin
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho) #25
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_ortho) #1004
# wilcox.test(merged_df_neg_patho_ortho$curve_residual, merged_df_neg_non_patho_ortho$curve_residual)
# # 1. Combine data
# plot_df2 <- bind_rows(
#   merged_df_neg_patho_ortho %>%
#     mutate(group = "Pathogenic"),
#   merged_df_neg_non_patho_ortho %>%
#     mutate(group = "Non-pathogenic")
# )
# 
# # 2. Summary stats for labeling
# label_df2 <- plot_df2 %>%
#   group_by(group) %>%
#   summarise(
#     n_label = paste0("n = ", n()),
#     median_val = median(curve_residual, na.rm = TRUE)
#   )
# plot_df2$group <- factor(plot_df2$group, levels = c("Pathogenic", "Non-pathogenic"))
# 
# # 3. Plot
# p_violin2 <- ggplot(plot_df2, aes(x = group, y = curve_residual, fill = group)) +
#   geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
#   geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey")  +
#   stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
#   stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
#                fill = "black", color = "black", stroke = 0.7) +
# 
#   # Add n labels
#   geom_text(
#     data = label_df2,
#     aes(x = group, y = max(plot_df2$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
#     inherit.aes = FALSE,
#     size = 4
#   ) +
# 
#   # Add median labels
#   geom_text(
#     data = label_df2,
#     aes(x = group, y = median_val + 0.5, label = sprintf("%.2f", median_val)),
#     inherit.aes = FALSE,
#     size = 6
#   ) +
# 
#   labs(
#     x = "",
#     y = "Residual to exponential curve",
#     title = "GCK:Orthosteric Variants"
#   ) +
#   theme_classic(base_size = 14) +
#   scale_fill_manual(values = c(
#     "Pathogenic" = "indianred3",
#     "Non-pathogenic" = "tan3"
#   )) +
#   theme(legend.position = "none") +
# 
#   # Wilcoxon significance
#   geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
#               map_signif_level = FALSE,
#               test = "wilcox.test",
#               tip_length = 0.01)
# 
# # Show plot
# p_violin2
# plot <- plot_grid(p_violin2, p_violin, ncol = 1, nrow =2)
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin6.pdf", 
#        plot = plot, width = 3, height = 6, dpi = 300)
# plot
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho) #25
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
#   filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_ortho) #1004
# wilcox.test(merged_df_neg_patho_ortho$curve_residual, merged_df_neg_non_patho_ortho$curve_residual)
# table(merged_df_neg$pathogenic_status)
# 
# merged_df_neg_patho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic")
# table(merged_df_neg_patho$sig_above_curve)
# 
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho )
# table(merged_df_neg_patho_ortho$sig_above_curve)
# #18/(18+7) = 0.72
# 
# merged_df_neg_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_patho_nonortho$sig_above_curve)
# #22/(50+22) = 0.3055556
# plot_df <- merged_df_neg %>%
#   filter(pathogenic_status == "Pathogenic") %>%
#   mutate(
#     group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
#     group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
#   ) %>%
#   group_by(group, sig_above_curve) %>%
#   summarise(count = n(), .groups = "drop") %>%
#   group_by(group) %>%
#   mutate(percentage = 100 * count / sum(count))
# 
# # Plot
# p2 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
#   geom_col(width = 0.6) +
#   geom_text(
#     aes(label = sprintf("%d (%.0f%%)", count, percentage)),
#     position = position_stack(vjust = 0.5),
#     size = 4
#   ) +
#   scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
#   scale_fill_manual(
#     values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
#     labels = c("Not above curve", "Above curve")
#   ) +
#   labs(
#     x = "",
#     y = "Variant percentage (%)",
#     title = "GCK: 97 Pathogenic Variants",
#         subtitle = "With negative activity-abundance residuals",
#     fill = "") +
#   theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_boxplot1.pdf", 
#        plot = p2, width = 4, height = 5, dpi = 300)
# p2
# table(merged_df_neg$pathogenic_status)
# 
# merged_df_neg_non_patho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic")
# table(merged_df_neg_patho$sig_above_curve)
# nrow(merged_df_neg_non_patho)
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>% filter(site_type != "Non-orthosteric site")
# table(merged_df_neg_non_patho_ortho$sig_above_curve)
# 
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_non_patho_nonortho$sig_above_curve)
# plot_df <- merged_df_neg %>%
#   filter(pathogenic_status != "Pathogenic") %>%
#   mutate(
#     group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
#     group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
#   ) %>%
#   group_by(group, sig_above_curve) %>%
#   summarise(count = n(), .groups = "drop") %>%
#   group_by(group) %>%
#   mutate(percentage = 100 * count / sum(count))
# 
# # Plot
# p3 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
#   geom_col(width = 0.6) +
#   geom_text(
#     aes(label = sprintf("%d (%.0f%%)", count, percentage)),
#     position = position_stack(vjust = 0.5),
#     size = 4
#   ) +
#   scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
#   scale_fill_manual(
#     values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
#     labels = c("Not above curve", "Above curve")
#   ) +
#   labs(
#     x = "",
#     y = "Variant percentage (%)",
#     title = "GCK: 4320 Non-pathogenic Variants",
#         subtitle = "With negative activity-abundance residuals",
#     fill = "") +
#   theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_boxplot2.pdf", 
#        plot = p3, width = 4, height = 5, dpi = 300)
# p3
lm_model <- lm(log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7944 -0.4913  0.2377  0.7202  2.7754 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.588428   0.035433  -16.61   <2e-16 ***
## min_dist_to_ligand -0.021230   0.001693  -12.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.114 on 7967 degrees of freedom
## Multiple R-squared:  0.01935,    Adjusted R-squared:  0.01923 
## F-statistic: 157.2 on 1 and 7967 DF,  p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -8.7790 -0.4103  0.2504  0.6455  1.8923 
# 
# Coefficients:
#                     Estimate Std. Error t value Pr(>|t|)    
# (Intercept)        -0.233775   0.039517  -5.916 3.55e-09 ***
# min_dist_to_ligand -0.044398   0.002099 -21.154  < 2e-16 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 1.001 on 4415 degrees of freedom
# Multiple R-squared:  0.09203, Adjusted R-squared:  0.09183 
# F-statistic: 447.5 on 1 and 4415 DF,  p-value: < 2.2e-16
nrow(merged_df) #4417
## [1] 7969
#merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255

#merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- gck_df
nrow(merged_df) #8255
## [1] 8255
#merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]
#nrow(merged_df) #7969

merged_df$pathogenic_status <- ifelse(
  merged_df$clean_condition %in% c(
    "HH", "MODY",
    "Monogenic diabetes"),
  "Pathogenic", "Other"
)

merged_df_int <- merged_df %>% filter(mutation_position %in% active_positions)
nrow(merged_df_int) #1031
## [1] 1031
table(merged_df_int$pathogenic_status)
## 
##      Other Pathogenic 
##        977         54
# Other Pathogenic 
#        977       54

merged_df_out <- merged_df %>% filter(!mutation_position %in% active_positions)
nrow(merged_df_out) #7224
## [1] 7224
table(merged_df_out$pathogenic_status)
## 
##      Other Pathogenic 
##       7054        170
     # Other Pathogenic 
     #  7054        170
table(merged_df$clean_condition)
## 
##                 HH              Mixed               MODY Monogenic diabetes 
##                  3                 14                 57                164 
##       Not provided              Other 
##                 36                  3
# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_int %>% filter(pathogenic_status == "Pathogenic"))

# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_int %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_int %>% filter(pathogenic_status != "Pathogenic") %>%
  filter(!(mutant %in% patho_df$mutant)) 

bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)   # Mean across matches per bootstrap
)

# Combine with patho residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)

label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  )

label_df <- label_df %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))

# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 2.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residuals",
    title = "Orthosteric variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01)

p_fast

# 1. Set up
set.seed(11)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_out %>% filter(pathogenic_status == "Pathogenic"))

# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_out %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"

# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_out %>% filter(pathogenic_status != "Pathogenic") 

nrow(non_patho_pool)
## [1] 7054
bootstrap_medians <- vector("numeric", length = n_boot)

# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
  mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))

# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)

# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)

for (i in 1:n_boot) {
  for (j in 1:n_patho) {
    bin_j <- patho_df$bin[j]
    candidates <- bin_lookup[[as.character(bin_j)]]
    if (!is.null(candidates) && length(candidates) > 0) {
      bootstrap_matrix[i, j] <- sample(candidates, 1)
    }
  }
}

# Summarize into a dataframe
boot_df <- data.frame(
  group = "Random abundance-matched",
  residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)   # Mean across matches per bootstrap
)

# Combine with patho residuals
plot_df <- bind_rows(
  patho_df %>% dplyr::select(group, residuals),
  boot_df
)

label_df <- plot_df %>%
  group_by(group) %>%
  summarise(
    n = n(),
    median_val = median(residuals),
    y_max = max(residuals),
    .groups = "drop"
  )

label_df <- label_df %>%
  mutate(n_label = case_when(
    group == "Random abundance-matched" ~ "bootstrapped 1000 times",
    TRUE ~ paste0("n = ", n)
  ))

# Plot
p_fast2 <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
  geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
    stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
               fill = "black", color = "black", stroke = 0.7) +

 geom_text(
  data = label_df,
  aes(x = group, y = 1.5, label = n_label),
  inherit.aes = FALSE,
  size = 4) +

  geom_text(
    data = label_df,
    aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  labs(
    x = "",
    y = "Activity-abundance residual",
    title = "Non-orthosteric variants"
  ) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
  theme(legend.position = "none") +
  geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
              map_signif_level = FALSE,
              test = "wilcox.test",
              tip_length = 0.01)


p_fast2

fig5G<- plot_grid(p_fast, p_fast2, ncol=1, nrow=2)
fig5G

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin6.pdf", 
       plot = fig5G, width = 3, height = 6, dpi = 300)